/* This dofile is to match follow-up survey outcome for grade 12 students with the current database */

// import dataset 
// most updated version on September 28
clear
import excel "$raw/student/followup survey grade 12/Follow-up_questions_on_final_decisions_-_Grade_12_-_all_versions_-_labels_-_2022-10-14-16-21-54", clear firstrow

sort _index
g indexnumber = _n

ren Studentcode studentid 
ren Schoolname schoolname 
ren Studentname name
ren CitizenshipIDNo citizenid
ren Didthestudentapplyforcol s_appcollege
ren stchoice s_appcollege_major1
ren ndchoice s_appcollege_major2
ren rdchoice s_appcollege_major3
ren ifothersPleasespecify s_appcollege_major_others

ren Wasthestudentadmittedto s_admcollege
ren Whichcollegemajorwasthe s_admcollege_major
ren S s_admcollege_major_others

ren Didthestudentapplyforte s_apptvet
ren U s_admtvet

ren Whatwasthestudentsfinal s_findecision
ren ifotherspleasespecify s_findecision_others
ren NameofthecollegeorTTII s_institute
ren IfOthersPleaseSpecify s_institute_others

rename *, lower   

// clean the survey data 
g s_final_tvet = regexm(s_findecision, "TTI|IZC")
lab var s_final_tvet "1/0 final decision is to continue TVET (survey data)"

g s_final_repeat = regexm(s_findecision, "Repeat")
lab var s_final_repeat "1/0 indicator for repeating grade 12 (survey data)"

g s_final_stop = regexm(s_findecision, "Repeat")
lab var s_final_stop "1/0 indicator for stop schooling (survey data)"

g s_app_tvet = regexm(s_apptvet, "Yes")
lab var s_app_tvet "1/0 application to TVET (survey data)"
g s_adm_tvet = regexm(s_admtvet, "Yes")
lab var s_adm_tvet "1/0 admission to TVET (survey data)"

g s_app_college = regexm(s_appcollege, "Yes")
lab var s_app_college "1/0 application to college (survey data)"

g s_adm_college = regexm(s_admcollege, "Yes")
lab var s_adm_college "1/0 admission to college (survey data)"

g s_appcol_sci = regexm(s_appcollege_major1, "Bachelors: Science, Math, Computer Science, Agriculture") | regexm(s_appcollege_major2,"Bachelors: Science, Math, Computer Science, Agriculture") | regexm(s_appcollege_major3,"Bachelors: Science, Math, Computer Science, Agriculture") 
lab var s_appcol_sci "1/0 indicator for applying to college - Bachelors: Science Math Computer Science Agriculture (survey data)"

g s_admcol_sci = regexm(s_admcollege_major,"Bachelors: Science, Math, Computer Science, Agriculture")
lab var s_admcol_sci "1/0 indicator for being accepted to college - Bachelors: Science Math Computer Science Agriculture (survey data)"

g s_appcol_eng = regexm(s_appcollege_major1,"Bachelors: Engineering") | regexm(s_appcollege_major2,"Bachelors: Engineering") | regexm(s_appcollege_major3,"Bachelors: Engineering") 
lab var s_appcol_eng "1/0 indicator for applying to college - Bachelors: Engineering (survey data)"

g s_admcol_eng = regexm(s_admcollege_major,"Bachelors: Engineering")
lab var s_admcol_eng "1/0 indicator for being accepted to college - Bachelors: Engineering (survey data)"

g s_appcol_med = regexm(s_appcollege_major1,"Bachelors: Medicine, Health, and Nursing") | regexm(s_appcollege_major2,"Bachelors: Medicine, Health, and Nursing") | regexm(s_appcollege_major3,"Bachelors: Medicine, Health, and Nursing") 
lab var s_appcol_med "1/0 indicator for applying to college - Bachelors: Medicine Health and Nursing (survey data)"

g s_admcol_med = regexm(s_admcollege_major,"Bachelors: Medicine, Health, and Nursing")
lab var s_admcol_med "1/0 indicator for being accepted to college - Bachelors: Medicine Health and Nursing (survey data)"

g s_appcol_edu = regexm(s_appcollege_major1,"Bachelors: Education") | regexm(s_appcollege_major2,"Bachelors: Education") | regexm(s_appcollege_major3,"Bachelors: Education") 
lab var s_appcol_edu "1/0 indicator for applying to college - Bachelors: Education (survey data)"

g s_admcol_edu = regexm(s_admcollege_major,"Bachelors: Education")
lab var s_admcol_edu "1/0 indicator for being accepted to college - Bachelors: Education (survey data)"

g s_appcol_bus = regexm(s_appcollege_major1,"Bachelors: Business, Management, Economics, and Law") | regexm(s_appcollege_major2,"Bachelors: Business, Management, Economics, and Law") | regexm(s_appcollege_major3,"Bachelors: Business, Management, Economics, and Law") 
lab var s_appcol_bus "1/0 indicator for applying to college - Bachelors: Business Management Economics and Law (survey data)"

g s_admcol_bus = regexm(s_admcollege_major,"Bachelors: Business, Management, Economics, and Law")
lab var s_admcol_bus "1/0 indicator for being accepted to college - Bachelors: Business Management Economics and Law (survey data)"

g s_appcol_art = regexm(s_appcollege_major1,"Bachelors: Arts, Humanities, and Social Studies") | regexm(s_appcollege_major2,"Bachelors: Arts, Humanities, and Social Studies") | regexm(s_appcollege_major3,"Bachelors: Arts, Humanities, and Social Studies") 
lab var s_appcol_art "1/0 indicator for applying to college - Bachelors: Arts Humanities and Social Studies (survey data)"

g s_admcol_art = regexm(s_admcollege_major,"Bachelors: Arts, Humanities, and Social Studies")
lab var s_admcol_art "1/0 indicator for being accepted to college - Bachelors: Arts Humanities and Social Studies (survey data)"

replace s_appcol_sci = 1 if regexm(s_appcollege_major_others, "Forest|Data|science|technology|Technology|Science|computer|biology")
replace s_appcol_eng = 1 if regexm(s_appcollege_major_others, "engineer")
replace s_appcol_art = 1 if regexm(s_appcollege_major_others, "English|community|psychology|journalism|Dzongkha|communication|service")
replace s_appcol_edu = 1 if regexm(s_appcollege_major_others, "education")
replace s_appcol_bus = 1 if regexm(s_appcollege_major_others, "law|business|accounting")
replace s_appcol_med = 1 if regexm(s_appcollege_major_others, "nursing|health|medicine")

replace citizenid = "" if length(citizenid)~=11
replace citizenid = "" if citizenid == "UNDER PROCE"
destring citizenid, replace

tostring citizenid, replace format("%20.0f")

replace schoolname = trim(itrim(lower(schoolname)))
tab schoolname

// compare schoolname with our database and fix those inconsistent
replace schoolname = "babesa higher secondary school" if regexm(schoolname, "babesa")
replace schoolname = "bjishong central school" if regexm(schoolname, "bjishong")
replace schoolname = "baylling central school" if regexm(schoolname, "baylling|bayling")
replace schoolname = "chukha central school" if regexm(schoolname, "chukha")
replace schoolname = "damphu central school" if regexm(schoolname, "damphu|dampnu")
replace schoolname = "desi higher secondary school" if regexm(schoolname, "desi")
replace schoolname = "gaselo central school" if regexm(schoolname, "gaselo")
replace schoolname = "jigme sherubling central school" if regexm(schoolname, "jigme")
replace schoolname = "kelki higher secondary school" if regexm(schoolname, "kelki")
replace schoolname = "motithang higher secondary school" if regexm(schoolname, "motithang|mothithang")
replace schoolname = "pelkhil higher secondary school" if regexm(schoolname, "pelkhil")
replace schoolname = "pakshika higher secondary school" if regexm(schoolname, "pakshika|pakshikha")
replace schoolname = "punakha central school" if regexm(schoolname, "phunakha")
replace schoolname = "shaba higher secondary school" if regexm(schoolname, "shaba")
replace schoolname = "tsenkharla central school" if regexm(schoolname, "tsenkharla")
replace schoolname = "wangbama central school" if regexm(schoolname, "wangbama")
replace schoolname = "yangchenphug higher secondary school" if regexm(schoolname, "yangchenphu|yangchenphug") 
replace schoolname = "yoezerling higher secondary school" if regexm(schoolname, "yoerzerling|yoezerling")
replace schoolname = "" if regexm(schoolname, "bachelors") 

replace name = "Jigme singye" if studentid == "Jigme singye" & name == "201.00380.11.0079"
replace studentid = "201.00380.11.0079" if name == "Jigme singye"  & citizenid == "11601002048"

replace name = "sonam Peldon" if studentid == "sonam Peldon" & name == "201.00398.11.0038"
replace studentid = "201.00398.11.0038" if studentid == "sonam Peldon" & citizenid == "11607001647"

replace name = trim(itrim(lower(name)))
replace name = subinstr(name," ","",.)  
replace name = subinstr(name,",","",.)   //Removes comma (,)
replace name = subinstr(name,"'","",.)   //Removes apostrophe (')
replace name = subinstr(name,".","",.)   //Removes dot (.) 
replace name = subinstr(name,"/","",.)   //Removes slash (/)
replace name = subinstr(name,"-","",.)   //Removes dash (-)
replace name = subinstr(name,"=","",.)   //Removes dash (-)
replace name = subinstr(name,"(","",.)   //Removes opening parentheses
replace name = subinstr(name,")","",.)   //Removes closing parentheses


gen dateofbirth1 = date(dateofbirth, "DMY")
format dateofbirth1 %td
g b_month = month(dateofbirth1)
g b_day = day(dateofbirth1)
g b_year = year(dateofbirth1)

g sex = 1 if gender == "Male"
replace sex = 2 if gender == "Female"
replace sex = 3 if gender == "Other "

drop dateofbirth dateofbirth1

g double dateofbirth = b_year*10^4 + b_month*10^2 + b_day
format dateofbirth %20.0f 
tostring dateofbirth, replace 

// correct for typos: this is done after finishing step 3, then manually check 

replace studentid = "204.00016.12.0002" if studentid == "204.00016.12.002" & name == "chatrabhadurchhetri"
replace studentid = "201.00059.11.0034" if studentid == "201.00059.11 0034" & name == "kuensumkuendreldrelma"
replace studentid = "201.00045.11.0055" if studentid == "201.00045.11.0054" & name == "phuntshozangmo"
replace studentid = "201.00029.11.0109" if studentid == "201.00029.11 0109" & name == "sangitarai"
replace studentid = "201.00403.11.0112" if studentid == "201.00403.11.00112"
replace studentid = "201.00452.11.0317" if studentid == "201.00049.11.0026" & sex == 1 & dateofbirth == "20030314" & name == "sonamyoezer"

replace studentid = "201.00315.11.0100" if studentid == "201.00315.11.01000" & name == "dechenpenjor"
replace citizenid = "10705005142" if studentid == "201.00269.11.0138" & citizenid == "10705005141"
replace studentid = "201.00010.11.3615" if studentid == "201.00010.113615" & name == "snehasubba"
replace studentid = "201.00154.11.0028" if studentid == "201.00154.110028"
replace studentid = "202.00017.11.0127" if studentid == "202.00017.0127"
replace studentid = "201.00296.11.0181" if studentid == "201.00396.11.0181" & name == "nirutalepcha"

replace studentid = "201.00296.11.0127" if studentid == "201.00396.11.01127" & name == "pemadechen"
replace studentid = "201.00288.11.0049" if studentid == "23.2.2002" & name == "sarasatidarjee"
replace studentid = "201.00312.11.0205" if studentid == "201.00312.11.9205" & name == "tandintshering"

replace citizenid = "11805000979" if studentid == "201.00436.11.0078" & citizenid == "11805000797"
replace studentid = "201.00524.11.0106" if studentid == "201.00436.11.0078" & citizenid == "11805002000"

replace studentid = "201.00301.11.0264" if studentid == "201..00301.11.0264" & name == "tsheringchodenghalley"

replace studentid = "201.00286.11.0326" if studentid == "201.00386.11.0326" & name == "namgayrinchen"
replace studentid = "201.00306.11.0065" if name == "ngawangnorbu" & schoolname == "pelkhil higher secondary school" & dateofbirth == "20020924"

replace citizenid = "11008000387" if studentid == "201.00457.11.0051" & name == "tashilhamo" & dateofbirth == "20040722"

// remove duplicated observations by keeping the later submission
sort schoolname name studentid sex 
g dup = 0
forval i=1/3 {
replace dup = 1 if schoolname == schoolname[_n+`i'] & name == name[_n+`i'] & sex == sex[_n+`i'] & studentid==studentid[_n+`i']
}

// save the list of duplicated observations and send to Cheku
preserve 
duplicates tag schoolname name sex studentid, g(duplicatedobservations)
tab duplicatedobservations
keep if duplicatedobservations ~= 0
keep start end enumeratorid schoolname name studentid citizenid gender phonenumber
export excel using "$temp/File1_grade12_degree_data_duplicatedobservations.xlsx", replace firstrow(variables)
restore 

keep if dup == 0

keep indexnumber name studentid citizenid b_* dateofbirth sex schoolname s_*
foreach variable in schoolname name studentid citizenid dateofbirth sex b_month b_day b_year{
	rename `variable' `variable'_s 
}

save "$temp/tvetchoice.dta", replace 


********************************************************************************
*
*			MERGE WITH GRADE 12 SURVEY DATA 
*
********************************************************************************

// STEP 1: PERFECT MATCHING ON STUDENT ID + DOUBLE CHECK NAME 
********************************************************************************
gl criteria studentid  

* open the tvet choice data 
use "$temp/tvetchoice.dta", clear 
rename studentid_s studentid 

duplicates tag $criteria, g(temp)
tab temp

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/tvet_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename studentid studentid_s
save $temp/tvet_step1d, replace


* open the survey data 
use "$temp/tvet_analysis.dta", clear
replace name = subinstr(name," ","",.)  
replace sex = 2 if sex == 0 
format studentid_bl %20s

generate str citizenid_st = citizenid
replace citizenid = ""
compress citizenid
replace citizenid = citizenid_st
drop citizenid_st
describe citizenid

save "$temp/data12.dta", replace 


use "$temp/data12.dta", clear
rename studentid_bl studentid 

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data12_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
rename studentid studentid_bl 
save $temp/data12_step1d, replace

* merging two unique data together using fully matched student ID & check fuzzy name 
use $temp/tvet_step1u, clear
merge 1:1 $criteria using $temp/data12_step1u
matchit name_s name, g(namescore)
matchit dateofbirth_s dateofbirth, g(bscore)
sum namescore if _merge==3

* correct if names are not matched, leave for the next round of matching
replace _merge = . if (namescore <= 0.5 & bscore < 0.8) & _merge == 3
replace _merge = . if studentid == "201.00323.11.0147"

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid 
unique indexnumber
save $temp/tvet_step1m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/tvetchoice.dta, keep(matched) nogen 
append using $temp/tvet_step1d
save $temp/tvet_step2b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step1d
save $temp/data12_step2b, replace 
restore 


// STEP 2: PERFECT MATCHING ON CITIZEN ID + DOUBLE CHECK NAME 
********************************************************************************
gl criteria citizenid 

* open the tvet choice data 
use $temp/tvet_step2b.dta, clear 
rename citizenid_s citizenid 

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/tvet_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename citizenid citizenid_s
save $temp/tvet_step2d, replace


* open the survey data 
use $temp/data12_step2b.dta, clear

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data12_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
save $temp/data12_step2d, replace


* merging two unique data together using fully matched citizenship ID & check fuzzy name 
use $temp/tvet_step2u
merge 1:1 $criteria using $temp/data12_step2u
matchit name_s name, g(namescore)
matchit dateofbirth_s dateofbirth, g(bscore)

sum namescore if _merge==3

replace _merge = . if namescore < 0.7 & bscore < 0.9 & _merge == 3

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid 
unique indexnumber 
save $temp/tvet_step2m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/tvetchoice.dta, keep(matched) nogen 
append using $temp/tvet_step2d
save $temp/tvet_step3b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step2d
save $temp/data12_step3b, replace 
restore  



// STEP 3: PERFECT MATCHING ON SEX + DATE OF BIRTH
********************************************************************************
gl criteria sex dateofbirth    

* open the tvet choice data 
use $temp/tvet_step3b.dta, clear 

rename dateofbirth_s dateofbirth 
rename sex_s sex

duplicates tag $criteria, g(temp)
tab temp

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/tvet_step3u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap ren dateofbirth dateofbirth_s
cap ren sex sex_s
save $temp/tvet_step3d, replace


* open the survey data 
use $temp/data12_step3b.dta, clear

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data12_step3u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
save $temp/data12_step3d, replace


* merging two unique data together 
use $temp/tvet_step3u, clear
merge 1:1 $criteria using $temp/data12_step3u
matchit name_s name, g(namescore)
matchit studentid_s studentid_bl, g(idscore)
matchit citizenid_s citizenid, g(cidscore)

*br name name_s *score if  _merge==3
sort name name_s

* correct if names are not matched, leave for the next round of matching
replace _merge = . if citizenid_s == "10204001407" & studentid_s == "201.00031.11.1234"
replace _merge = . if citizenid_s == "." & studentid_s == "201.00451.0048" 

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid
unique indexnumber  
save $temp/tvet_step3m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/tvetchoice.dta, keep(matched) nogen 
append using $temp/tvet_step3d
save $temp/tvet_step4b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data12.dta, keep(matched) nogen 
append using $temp/data12_step3d
save $temp/data12_step4b, replace 
restore  


// combine data together 
clear 
append using $temp/tvet_step1m
g s_tvet_match_criteria = "perfect(studentID) + fuzzy(name/dob)"

append using $temp/tvet_step2m
replace s_tvet_match_criteria = "perfect(CID) + fuzzy(name/dob)" if missing(s_tvet_match_criteria)

append using $temp/tvet_step3m
replace s_tvet_match_criteria = "perfect(dob&sex) + fuzzy(studentID|CID|name)" if missing(s_tvet_match_criteria)
count

tab s_tvet_match_criteria

merge 1:1 indexnumber using "$temp/tvetchoice.dta", keep(matched) keepusing(s_*) nogen
drop indexnumber
g s_survey_data = 1 
lab var s_survey_data "follow-up survey data on application/final decisions available"

save "$temp/grade12_analysis_survey.dta", replace 





